# === Replication data for Bureaucratic Responsiveness in Humanitarian Aid, Main Document === #
### This file is used to replicate all figures and tables found in the main document
### See the two additional scripts for replication of appendix figures and tables

#load environment
library(dplyr)
library(ggplot2)
library(fixest)
library(broom)
library(mice)
library(miceadds)
library(purrr)
library(tidyr)
library(margins)
library(interactions)

#load the data set and transform variables
data <- read.csv(file = "maindata_fpa_replication.csv")

data$affected_ofda[data$affected_ofda == 0] <- NA

numeric_vars <- sapply(data, is.numeric)

for (var in names(data)[numeric_vars]) {
  temp <- data[[var]]
  
  temp[temp == 0] <- 0.1
  
  if (var == "lag_trade_bal") {
    data[[paste0("log_", var)]] <- sign(temp) * log(abs(temp))
  } else {
    temp[temp < 0 | is.na(temp)] <- NA
    data[[paste0("log_", var)]] <- log(temp)
  }
}

vars_to_center <- c("log_med_h", "log_med_words", "log_ch_words", "log_affected_ofda", 
                    "log_lag_trade_bal", "log_lag_adj_gdp", "log_lag_pop")

data <- data %>%
  mutate(across(all_of(vars_to_center), ~ as.numeric(scale(. , center = TRUE, scale = FALSE)), .names = "centered_{.col}"))


#run the regression using mice
data <- data %>% select(-affected_ofda, -lag_trade_bal,
                        -lag_adj_gdp, -lag_pop, -troops, 
                        -ch_words, -med_words, -med_h, -adj_ofda, -log_troops)

tempData <- mice(data, m=5, maxit=50, meth='pmm', seed=500, print = FALSE)
datlist <- mids2datlist(tempData)

mod1 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               case + factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod1, coef)
vars <- lapply(mod1, vcov)
sum1 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

mod2 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words+ 
               centered_log_med_h +
               centered_log_affected_ofda + 
               case + factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod2, coef)
vars <- lapply(mod2, vcov)
sum2 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)


mod3 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal +
               case + factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod3, coef)
vars <- lapply(mod3, vcov)
sum3 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)


mod4 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               case + factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod4, coef)
vars <- lapply(mod4, vcov)
sum4 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)


mod5 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               centered_log_lag_pop +
               case + factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod5, coef)
vars <- lapply(mod5, vcov)
sum5 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

mod6 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               centered_log_lag_pop +
               centered_log_ch_words +
               case + factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod6, coef)
vars <- lapply(mod6, vcov)
sum6 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

process_summary <- function(sum_table, model_name) {
  df <- data.frame(
    term = rownames(sum_table),           
    results = sum_table[, "results"],    
    se = sum_table[, "se"],       
    p = sum_table[, "p"],         
    model = model_name                    
  ) %>%
    filter(!grepl("case|year", term)) %>%
    mutate(
      # Append significance markers to results
      results = case_when(
        p < 0.01 ~ paste0(results, "***"),
        p < 0.05 ~ paste0(results, "**"),
        p < 0.101 ~ paste0(results, "*"),
        TRUE ~ as.character(results)
      )
    ) %>%
    select(-p)  # Remove p-values column
  
  return(df)
}

summary_tables <- list(sum1, sum2, sum3, sum4, sum5, sum6)
model_names <- c("Model 1", "Model 2", "Model 3", "Model 4", "Model 5", "Model 6")
combined_data <- map2_df(summary_tables, model_names, process_summary)

formatted_table <- combined_data %>%
  pivot_wider(names_from = model, values_from = c(results, se))


# === Table 1. Estimated Coefficients for CE Aid Allocations === #
print(formatted_table)

#calculate r-squared
calc_r_squared <- function(model_list, datlist, formula) {
  r2_values <- sapply(1:length(datlist), function(i) {
    model <- model_list[[i]]
    data <- datlist[[i]]
    
    complete_data <- model.frame(as.formula(formula), data = data, na.action = na.omit)
    
    y <- complete_data$log_adj_ofda
    
    X <- model.matrix(as.formula(formula), data = complete_data)
    
    beta <- coef(model)
    
    common_vars <- intersect(names(beta), colnames(X))
    X <- X[, common_vars, drop = FALSE]
    beta <- beta[common_vars]
    
    y_hat <- X %*% beta  
    
    ss_total <- sum((y - mean(y, na.rm = TRUE))^2)
    ss_residual <- sum((y - y_hat)^2)
    1 - (ss_residual / ss_total)
  })
  
  # Pooling R^2 using Rubin�fs Rules
  m <- length(r2_values)
  r_bar <- mean(r2_values)
  b_var <- var(r2_values)
  w_var <- mean((r2_values - r_bar)^2)
  total_var <- ((m + 1) / m) * b_var - (1 / m) * w_var
  
  lower_ci <- r_bar - 1.96 * sqrt(total_var)
  upper_ci <- r_bar + 1.96 * sqrt(total_var)
  
  return(list(R_squared = r_bar, CI = c(lower_ci, upper_ci)))
}


models <- list(mod1, mod2, mod3, mod4, mod5, mod6)
formulas <- list(
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + case + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    case + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + 
    case + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
    case + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
    centered_log_lag_pop + case + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
    centered_log_lag_pop + centered_log_ch_words +
    case + factor(year) +
    centered_log_med_h * centered_log_med_words
)

r2_results <- lapply(seq_along(models), function(i) {
  cat("\nCalculating R2 for Model", i, "...\n")
  calc_r_squared(models[[i]], datlist, formulas[[i]])
})

r2_df <- data.frame(
  Model = paste0("mod", 1:6),
  R_squared = sapply(r2_results, function(x) x$R_squared),
  CI_lower = sapply(r2_results, function(x) x$CI[1]),
  CI_upper = sapply(r2_results, function(x) x$CI[2])
)

# === Table 1. Estimated Coefficients for CE Aid Allocations R-Squared Values === #
print(r2_df)


#calculate marginal effects
#Mod5 will be used as the model of best fit
mod5 <- lapply(datlist, function(i) {
  lm(log_adj_ofda ~ centered_log_med_words + 
       centered_log_med_h + centered_log_affected_ofda +
       centered_log_lag_trade_bal + centered_log_lag_adj_gdp +
       centered_log_lag_pop + case + factor(year) + 
       centered_log_med_h * centered_log_med_words, data = i)
})

calculate_marginal_effect <- function(model, centered_log_med_h_level) {
  coef <- summary(model)$coefficients
  beta1 <- coef["centered_log_med_words", "Estimate"]
  beta3 <- coef["centered_log_med_words:centered_log_med_h", "Estimate"]
  
  marginal_effect <- beta1 + beta3 * centered_log_med_h_level
  return(marginal_effect)
}


marginal_effects_list <- lapply(mod5, function(model) {
  mean_log_med_h <- mean(model$model$centered_log_med_h)
  sd_log_med_h <- sd(model$model$centered_log_med_h)
  
  list(
    mean = calculate_marginal_effect(model, mean_log_med_h),
    mean_minus_sd = calculate_marginal_effect(model, mean_log_med_h - sd_log_med_h),
    mean_plus_sd = calculate_marginal_effect(model, mean_log_med_h + sd_log_med_h)
  )
})

extract_effects <- function(effect_list, level) {
  sapply(effect_list, function(e) e[[level]])
}

mean_effects <- extract_effects(marginal_effects_list, "mean")
mean_minus_sd_effects <- extract_effects(marginal_effects_list, "mean_minus_sd")
mean_plus_sd_effects <- extract_effects(marginal_effects_list, "mean_plus_sd")

average_mean_effect <- mean(mean_effects)
average_mean_minus_sd_effect <- mean(mean_minus_sd_effects)
average_mean_plus_sd_effect <- mean(mean_plus_sd_effects)

base_effects <- c(average_mean_minus_sd_effect, average_mean_effect, average_mean_plus_sd_effect)

marginal_effects_df <- data.frame(
  Level = c("Mean - SD", "Mean", "Mean + SD"),
  Effect_1pct   = base_effects,
  Effect_10pct  = base_effects * 10,
  Effect_50pct  = base_effects * 50,
  Effect_100pct = base_effects * 100,
  Effect_250pct = base_effects * 250
)

# === Table 2. Marginal Effects of Media Volume on Aid at Different Levels of Media Diversity === #
print(marginal_effects_df)


#visualize interaction effects between main independent variables
#Use mod5 as the model of best fit
mod5_rep <- mod5[[1]]
mean_h <- mean(datlist[[1]]$centered_log_med_h, na.rm = TRUE)
sd_h <- sd(datlist[[1]]$centered_log_med_h, na.rm = TRUE)


# === Figure 1 === #
interact_plot(mod5_rep, 
              pred = centered_log_med_words, 
              modx = centered_log_med_h, 
              modx.values = c(mean_h - 2*sd_h, mean_h, mean_h + 2*sd_h), 
              interval = FALSE, 
              modx.labels = c("-2 SD", "Mean", "+2 SD"),
              main.title = "Interaction Plot",
              x.label = "Log Media Volume (Centered)",
              y.label = "Log Adjusted OFDA",
              legend.main = "Log Media Diversity (Centered)")

